-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[LLD][X86] Match delayLoad thunk with MSVC #149521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Previously we saved registers in the shadow space of callee before calling __delayLoadHelper2. Now we save arguments in the shadow space of the caller and allocate shadow space for the callee. Fixes llvm#51941 Co-authored-by: Benjamin Santerre <[email protected]>
@llvm/pr-subscribers-platform-windows @llvm/pr-subscribers-lld Author: Evgenii Kudriashov (e-kud) ChangesPreviously we saved registers in the shadow space of callee before calling __delayLoadHelper2. Now we save arguments in the shadow space of the caller and allocate shadow space for the callee. Fixes #51941 Full diff: https://github.com/llvm/llvm-project/pull/149521.diff 3 Files Affected:
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index c327da28ce138..d98126fea7234 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -244,28 +244,28 @@ static const uint8_t thunkX64[] = {
};
static const uint8_t tailMergeX64[] = {
- 0x51, // push rcx
- 0x52, // push rdx
- 0x41, 0x50, // push r8
- 0x41, 0x51, // push r9
- 0x48, 0x83, 0xEC, 0x48, // sub rsp, 48h
- 0x66, 0x0F, 0x7F, 0x04, 0x24, // movdqa xmmword ptr [rsp], xmm0
- 0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x10, // movdqa xmmword ptr [rsp+10h], xmm1
- 0x66, 0x0F, 0x7F, 0x54, 0x24, 0x20, // movdqa xmmword ptr [rsp+20h], xmm2
- 0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x30, // movdqa xmmword ptr [rsp+30h], xmm3
- 0x48, 0x8B, 0xD0, // mov rdx, rax
- 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
- 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
- 0x66, 0x0F, 0x6F, 0x04, 0x24, // movdqa xmm0, xmmword ptr [rsp]
- 0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x10, // movdqa xmm1, xmmword ptr [rsp+10h]
- 0x66, 0x0F, 0x6F, 0x54, 0x24, 0x20, // movdqa xmm2, xmmword ptr [rsp+20h]
- 0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x30, // movdqa xmm3, xmmword ptr [rsp+30h]
- 0x48, 0x83, 0xC4, 0x48, // add rsp, 48h
- 0x41, 0x59, // pop r9
- 0x41, 0x58, // pop r8
- 0x5A, // pop rdx
- 0x59, // pop rcx
- 0xFF, 0xE0, // jmp rax
+ 0x48, 0x89, 0x4C, 0x24, 0x08, // mov qword ptr [rsp+8], rcx
+ 0x48, 0x89, 0x54, 0x24, 0x10, // mov qword ptr [rsp+10h], rdx
+ 0x4C, 0x89, 0x44, 0x24, 0x18, // mov qword ptr [rsp+18h], r8
+ 0x4C, 0x89, 0x4C, 0x24, 0x20, // mov qword ptr [rsp+20h], r9
+ 0x48, 0x83, 0xEC, 0x68, // sub rsp, 68h
+ 0x66, 0x0F, 0x7F, 0x44, 0x24, 0x20, // movdqa xmmword ptr [rsp+20h], xmm0
+ 0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x30, // movdqa xmmword ptr [rsp+30h], xmm1
+ 0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40, // movdqa xmmword ptr [rsp+40h], xmm2
+ 0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50, // movdqa xmmword ptr [rsp+50h], xmm3
+ 0x48, 0x8B, 0xD0, // mov rdx, rax
+ 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
+ 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
+ 0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20, // movdqa xmm0, xmmword ptr [rsp+20h]
+ 0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30, // movdqa xmm1, xmmword ptr [rsp+30h]
+ 0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40, // movdqa xmm2, xmmword ptr [rsp+40h]
+ 0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50, // movdqa xmm3, xmmword ptr [rsp+50h]
+ 0x48, 0x8B, 0x4C, 0x24, 0x70, // mov rcx, qword ptr [rsp+70h]
+ 0x48, 0x8B, 0x54, 0x24, 0x78, // mov rdx, qword ptr [rsp+78h]
+ 0x4C, 0x8B, 0x84, 0x24, 0x80, 00, 00, 00, // mov r8, qword ptr [rsp+80h]
+ 0x4C, 0x8B, 0x8C, 0x24, 0x88, 00, 00, 00, // mov r9, qword ptr [rsp+88h]
+ 0x48, 0x83, 0xC4, 0x68, // add rsp, 68h
+ 0xFF, 0xE0, // jmp rax
};
static const uint8_t tailMergeUnwindInfoX64[] = {
@@ -378,8 +378,8 @@ class TailMergeChunkX64 : public NonSectionCodeChunk {
void writeTo(uint8_t *buf) const override {
memcpy(buf, tailMergeX64, sizeof(tailMergeX64));
- write32le(buf + 39, desc->getRVA() - rva - 43);
- write32le(buf + 44, helper->getRVA() - rva - 48);
+ write32le(buf + 54, desc->getRVA() - rva - 58);
+ write32le(buf + 59, helper->getRVA() - rva - 63);
}
Chunk *desc = nullptr;
diff --git a/lld/test/COFF/delayimports.test b/lld/test/COFF/delayimports.test
index f410eef35fd1d..1521155d8764f 100644
--- a/lld/test/COFF/delayimports.test
+++ b/lld/test/COFF/delayimports.test
@@ -44,7 +44,7 @@ BASEREL-NEXT: }
UNWIND: UnwindInformation [
UNWIND-NEXT: RuntimeFunction {
UNWIND-NEXT: StartAddress: (0x14000108A)
-UNWIND-NEXT: EndAddress: (0x1400010DD)
+UNWIND-NEXT: EndAddress: (0x140001101)
UNWIND-NEXT: UnwindInfoAddress: (0x140002000)
UNWIND-NEXT: UnwindInfo {
UNWIND-NEXT: Version: 1
diff --git a/lld/test/COFF/delayimporttables.yaml b/lld/test/COFF/delayimporttables.yaml
index cf54c0a7140a1..a4e0cf0cc992c 100644
--- a/lld/test/COFF/delayimporttables.yaml
+++ b/lld/test/COFF/delayimporttables.yaml
@@ -37,11 +37,11 @@
# CHECK-NEXT: UnloadDelayImportTable: 0x0
# CHECK-NEXT: Import {
# CHECK-NEXT: Symbol: left (0)
-# CHECK-NEXT: Address: 0x1400010B8
+# CHECK-NEXT: Address: 0x1400010DC
# CHECK-NEXT: }
# CHECK-NEXT: Import {
# CHECK-NEXT: Symbol: right (0)
-# CHECK-NEXT: Address: 0x1400010C4
+# CHECK-NEXT: Address: 0x1400010E8
# CHECK-NEXT: }
# CHECK-NEXT: }
|
@llvm/pr-subscribers-lld-coff Author: Evgenii Kudriashov (e-kud) ChangesPreviously we saved registers in the shadow space of callee before calling __delayLoadHelper2. Now we save arguments in the shadow space of the caller and allocate shadow space for the callee. Fixes #51941 Full diff: https://github.com/llvm/llvm-project/pull/149521.diff 3 Files Affected:
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index c327da28ce138..d98126fea7234 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -244,28 +244,28 @@ static const uint8_t thunkX64[] = {
};
static const uint8_t tailMergeX64[] = {
- 0x51, // push rcx
- 0x52, // push rdx
- 0x41, 0x50, // push r8
- 0x41, 0x51, // push r9
- 0x48, 0x83, 0xEC, 0x48, // sub rsp, 48h
- 0x66, 0x0F, 0x7F, 0x04, 0x24, // movdqa xmmword ptr [rsp], xmm0
- 0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x10, // movdqa xmmword ptr [rsp+10h], xmm1
- 0x66, 0x0F, 0x7F, 0x54, 0x24, 0x20, // movdqa xmmword ptr [rsp+20h], xmm2
- 0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x30, // movdqa xmmword ptr [rsp+30h], xmm3
- 0x48, 0x8B, 0xD0, // mov rdx, rax
- 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
- 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
- 0x66, 0x0F, 0x6F, 0x04, 0x24, // movdqa xmm0, xmmword ptr [rsp]
- 0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x10, // movdqa xmm1, xmmword ptr [rsp+10h]
- 0x66, 0x0F, 0x6F, 0x54, 0x24, 0x20, // movdqa xmm2, xmmword ptr [rsp+20h]
- 0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x30, // movdqa xmm3, xmmword ptr [rsp+30h]
- 0x48, 0x83, 0xC4, 0x48, // add rsp, 48h
- 0x41, 0x59, // pop r9
- 0x41, 0x58, // pop r8
- 0x5A, // pop rdx
- 0x59, // pop rcx
- 0xFF, 0xE0, // jmp rax
+ 0x48, 0x89, 0x4C, 0x24, 0x08, // mov qword ptr [rsp+8], rcx
+ 0x48, 0x89, 0x54, 0x24, 0x10, // mov qword ptr [rsp+10h], rdx
+ 0x4C, 0x89, 0x44, 0x24, 0x18, // mov qword ptr [rsp+18h], r8
+ 0x4C, 0x89, 0x4C, 0x24, 0x20, // mov qword ptr [rsp+20h], r9
+ 0x48, 0x83, 0xEC, 0x68, // sub rsp, 68h
+ 0x66, 0x0F, 0x7F, 0x44, 0x24, 0x20, // movdqa xmmword ptr [rsp+20h], xmm0
+ 0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x30, // movdqa xmmword ptr [rsp+30h], xmm1
+ 0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40, // movdqa xmmword ptr [rsp+40h], xmm2
+ 0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50, // movdqa xmmword ptr [rsp+50h], xmm3
+ 0x48, 0x8B, 0xD0, // mov rdx, rax
+ 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
+ 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
+ 0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20, // movdqa xmm0, xmmword ptr [rsp+20h]
+ 0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30, // movdqa xmm1, xmmword ptr [rsp+30h]
+ 0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40, // movdqa xmm2, xmmword ptr [rsp+40h]
+ 0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50, // movdqa xmm3, xmmword ptr [rsp+50h]
+ 0x48, 0x8B, 0x4C, 0x24, 0x70, // mov rcx, qword ptr [rsp+70h]
+ 0x48, 0x8B, 0x54, 0x24, 0x78, // mov rdx, qword ptr [rsp+78h]
+ 0x4C, 0x8B, 0x84, 0x24, 0x80, 00, 00, 00, // mov r8, qword ptr [rsp+80h]
+ 0x4C, 0x8B, 0x8C, 0x24, 0x88, 00, 00, 00, // mov r9, qword ptr [rsp+88h]
+ 0x48, 0x83, 0xC4, 0x68, // add rsp, 68h
+ 0xFF, 0xE0, // jmp rax
};
static const uint8_t tailMergeUnwindInfoX64[] = {
@@ -378,8 +378,8 @@ class TailMergeChunkX64 : public NonSectionCodeChunk {
void writeTo(uint8_t *buf) const override {
memcpy(buf, tailMergeX64, sizeof(tailMergeX64));
- write32le(buf + 39, desc->getRVA() - rva - 43);
- write32le(buf + 44, helper->getRVA() - rva - 48);
+ write32le(buf + 54, desc->getRVA() - rva - 58);
+ write32le(buf + 59, helper->getRVA() - rva - 63);
}
Chunk *desc = nullptr;
diff --git a/lld/test/COFF/delayimports.test b/lld/test/COFF/delayimports.test
index f410eef35fd1d..1521155d8764f 100644
--- a/lld/test/COFF/delayimports.test
+++ b/lld/test/COFF/delayimports.test
@@ -44,7 +44,7 @@ BASEREL-NEXT: }
UNWIND: UnwindInformation [
UNWIND-NEXT: RuntimeFunction {
UNWIND-NEXT: StartAddress: (0x14000108A)
-UNWIND-NEXT: EndAddress: (0x1400010DD)
+UNWIND-NEXT: EndAddress: (0x140001101)
UNWIND-NEXT: UnwindInfoAddress: (0x140002000)
UNWIND-NEXT: UnwindInfo {
UNWIND-NEXT: Version: 1
diff --git a/lld/test/COFF/delayimporttables.yaml b/lld/test/COFF/delayimporttables.yaml
index cf54c0a7140a1..a4e0cf0cc992c 100644
--- a/lld/test/COFF/delayimporttables.yaml
+++ b/lld/test/COFF/delayimporttables.yaml
@@ -37,11 +37,11 @@
# CHECK-NEXT: UnloadDelayImportTable: 0x0
# CHECK-NEXT: Import {
# CHECK-NEXT: Symbol: left (0)
-# CHECK-NEXT: Address: 0x1400010B8
+# CHECK-NEXT: Address: 0x1400010DC
# CHECK-NEXT: }
# CHECK-NEXT: Import {
# CHECK-NEXT: Symbol: right (0)
-# CHECK-NEXT: Address: 0x1400010C4
+# CHECK-NEXT: Address: 0x1400010E8
# CHECK-NEXT: }
# CHECK-NEXT: }
|
Should there be any specific tests? I'm not very familiar with LLD. CC @DeChambord |
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions cpp -- lld/COFF/DLL.cpp View the diff from clang-format here.diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index 3ce8853ad..ffc0969d3 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -254,18 +254,18 @@ static const uint8_t tailMergeX64[] = {
0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40, // movdqa xmmword ptr [rsp+40h], xmm2
0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50, // movdqa xmmword ptr [rsp+50h], xmm3
0x48, 0x8B, 0xD0, // mov rdx, rax
- 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
- 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
+ 0x48, 0x8D, 0x0D, 0, 0, 0, 0, // lea rcx, [___DELAY_IMPORT_...]
+ 0xE8, 0, 0, 0, 0, // call __delayLoadHelper2
0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20, // movdqa xmm0, xmmword ptr [rsp+20h]
0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30, // movdqa xmm1, xmmword ptr [rsp+30h]
0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40, // movdqa xmm2, xmmword ptr [rsp+40h]
0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50, // movdqa xmm3, xmmword ptr [rsp+50h]
0x48, 0x8B, 0x4C, 0x24, 0x70, // mov rcx, qword ptr [rsp+70h]
0x48, 0x8B, 0x54, 0x24, 0x78, // mov rdx, qword ptr [rsp+78h]
- 0x4C, 0x8B, 0x84, 0x24, 0x80, 0, 0, 0, // mov r8, qword ptr [rsp+80h]
- 0x4C, 0x8B, 0x8C, 0x24, 0x88, 0, 0, 0, // mov r9, qword ptr [rsp+88h]
- 0x48, 0x83, 0xC4, 0x68, // add rsp, 68h
- 0xFF, 0xE0, // jmp rax
+ 0x4C, 0x8B, 0x84, 0x24, 0x80, 0, 0, 0, // mov r8, qword ptr [rsp+80h]
+ 0x4C, 0x8B, 0x8C, 0x24, 0x88, 0, 0, 0, // mov r9, qword ptr [rsp+88h]
+ 0x48, 0x83, 0xC4, 0x68, // add rsp, 68h
+ 0xFF, 0xE0, // jmp rax
};
static const uint8_t tailMergeUnwindInfoX64[] = {
|
Thank you for looking at it. I'm not familiar with the LLD project either, but in essence this issue shows we're missing converage on floating point arguments. Here we want to test that a delayload'ed function such as __declspec(dllexport) void foo(double one, double two) {
ASSERT_EQ(one, 1.0);
ASSERT_EQ(two, 2.0);
} called with : __declspec(dllimport) void foo(double one, double two);
void main () {
foo(1.0, 2.0);
} Works on the target machine. On the current implementation, the 2nd equality assert would fail on windows. |
I think that |
@cjacek thanks! Synced it with |
@DeChambord thank you for the smallest reproducer. But I wasn't able to find any existing tests for |
@cjacek ping |
lld/COFF/DLL.cpp
Outdated
0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50, // movdqa xmm3, xmmword ptr [rsp+50h] | ||
0x48, 0x8B, 0x4C, 0x24, 0x70, // mov rcx, qword ptr [rsp+70h] | ||
0x48, 0x8B, 0x54, 0x24, 0x78, // mov rdx, qword ptr [rsp+78h] | ||
0x4C, 0x8B, 0x84, 0x24, 0x80, 00, 00, 00, // mov r8, qword ptr [rsp+80h] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's unfortunate that it doesn't pass clang-format, but the original code didn't either, and I don't see a good way to improve it. Please use 0 instead of 00 for those constants. It's more consistent and helps keep within the 80-column limit.
Other than that, looks good to me, thanks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! This and an extra space after opcodes helped to fit it into 80 chars.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks.
This looks like a fix that we should consider backporting to the 21.x release branch. |
/cherry-pick 75b79c9 |
/pull-request #151307 |
Previously we saved registers in the shadow space of callee before calling __delayLoadHelper2. Now we save arguments in the shadow space of the caller and allocate shadow space for the callee. Fixes llvm#51941 --------- Co-authored-by: Benjamin Santerre <[email protected]> (cherry picked from commit 75b79c9)
Previously we saved registers in the shadow space of callee before calling __delayLoadHelper2. Now we save arguments in the shadow space of the caller and allocate shadow space for the callee.
Fixes #51941